In [1]:
import pandas as pd
import numpy as np 
import sys
In [2]:
sys.version
Out[2]:
'3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]'
In [3]:
np.__version__
Out[3]:
'1.21.5'
In [4]:
train = pd.read_csv ('train.csv')
test = pd.read_csv ('test.csv')
In [5]:
train.head(10)
Out[5]:
galactic year galaxy existence expectancy index existence expectancy at birth Gross income per capita Income Index Expected years of education (galactic years) Mean years of education (galactic years) Intergalactic Development Index (IDI) Education Index ... Intergalactic Development Index (IDI), female Intergalactic Development Index (IDI), male Gender Development Index (GDI) Intergalactic Development Index (IDI), female, Rank Intergalactic Development Index (IDI), male, Rank Adjusted net savings Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total Private galaxy capital flows (% of GGP) Gender Inequality Index (GII) y
0 990025 Large Magellanic Cloud (LMC) 0.628657 63.125200 27109.234310 0.646039 8.240543 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.052590
1 990025 Camelopardalis B 0.818082 81.004994 30166.793958 0.852246 10.671823 4.742470 0.833624 0.467873 ... NaN NaN NaN NaN NaN 19.177926 NaN 22.785018 NaN 0.059868
2 990025 Virgo I 0.659443 59.570534 8441.707353 0.499762 8.840316 5.583973 0.469110 0.363837 ... NaN NaN NaN NaN NaN 21.151265 6.534020 NaN NaN 0.050449
3 990025 UGC 8651 (DDO 181) 0.555862 52.333293 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 5.912194 NaN NaN 0.049394
4 990025 Tucana Dwarf 0.991196 81.802464 81033.956906 1.131163 13.800672 13.188907 0.910341 0.918353 ... NaN NaN NaN NaN NaN NaN 5.611753 NaN NaN 0.154247
5 990025 KKh 060 0.824692 63.887135 28409.062695 0.671697 14.062458 9.978597 0.815264 0.796807 ... NaN NaN NaN NaN NaN 40.118699 3.981105 21.012897 NaN 0.052871
6 990025 Grus II 0.657457 68.555326 25648.328827 0.745674 15.434546 10.021786 0.662192 0.743891 ... NaN NaN NaN NaN NaN 12.984996 4.320539 NaN NaN 0.052780
7 990025 UGCA 292 0.951043 75.693397 20935.541513 0.947961 14.862880 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 4.191899 NaN NaN 0.062652
8 990025 Aquarius II 0.657402 61.337084 18231.029378 0.826377 15.173325 6.941815 0.725536 0.647240 ... NaN NaN NaN NaN NaN -4.709357 6.151802 NaN NaN 0.053927
9 990025 Andromeda XI 0.657180 62.554929 16196.125655 0.679096 12.937281 6.529242 0.561520 0.482450 ... NaN NaN NaN NaN NaN 8.731994 6.684401 NaN NaN 0.050588

10 rows × 80 columns

In [6]:
train.shape
Out[6]:
(3865, 80)
In [7]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3865 entries, 0 to 3864
Data columns (total 80 columns):
 #   Column                                                                                   Non-Null Count  Dtype  
---  ------                                                                                   --------------  -----  
 0   galactic year                                                                            3865 non-null   int64  
 1   galaxy                                                                                   3865 non-null   object 
 2   existence expectancy index                                                               3864 non-null   float64
 3   existence expectancy at birth                                                            3864 non-null   float64
 4   Gross income per capita                                                                  3837 non-null   float64
 5   Income Index                                                                             3837 non-null   float64
 6   Expected years of education (galactic years)                                             3732 non-null   float64
 7   Mean years of education (galactic years)                                                 3502 non-null   float64
 8   Intergalactic Development Index (IDI)                                                    3474 non-null   float64
 9   Education Index                                                                          3474 non-null   float64
 10  Intergalactic Development Index (IDI), Rank                                              3432 non-null   float64
 11  Population using at least basic drinking-water services (%)                              2021 non-null   float64
 12  Population using at least basic sanitation services (%)                                  2015 non-null   float64
 13  Gross capital formation (% of GGP)                                                       1502 non-null   float64
 14  Population, total (millions)                                                             1271 non-null   float64
 15  Population, urban (%)                                                                    1271 non-null   float64
 16  Mortality rate, under-five (per 1,000 live births)                                       1271 non-null   float64
 17  Mortality rate, infant (per 1,000 live births)                                           1259 non-null   float64
 18  Old age dependency ratio (old age (65 and older) per 100 creatures (ages 15-64))         1264 non-null   float64
 19  Population, ages 15–64 (millions)                                                        1264 non-null   float64
 20  Population, ages 65 and older (millions)                                                 1264 non-null   float64
 21  Life expectancy at birth, male (galactic years)                                          1264 non-null   float64
 22  Life expectancy at birth, female (galactic years)                                        1264 non-null   float64
 23  Population, under age 5 (millions)                                                       1264 non-null   float64
 24  Young age (0-14) dependency ratio (per 100 creatures ages 15-64)                         1264 non-null   float64
 25  Adolescent birth rate (births per 1,000 female creatures ages 15-19)                     1252 non-null   float64
 26  Total unemployment rate (female to male ratio)                                           1237 non-null   float64
 27  Vulnerable employment (% of total employment)                                            1237 non-null   float64
 28  Unemployment, total (% of labour force)                                                  1237 non-null   float64
 29  Employment in agriculture (% of total employment)                                        1237 non-null   float64
 30  Labour force participation rate (% ages 15 and older)                                    1237 non-null   float64
 31  Labour force participation rate (% ages 15 and older), female                            1237 non-null   float64
 32  Employment in services (% of total employment)                                           1237 non-null   float64
 33  Labour force participation rate (% ages 15 and older), male                              1237 non-null   float64
 34  Employment to population ratio (% ages 15 and older)                                     1237 non-null   float64
 35  Jungle area (% of total land area)                                                       1234 non-null   float64
 36  Share of employment in nonagriculture, female (% of total employment in nonagriculture)  1237 non-null   float64
 37  Youth unemployment rate (female to male ratio)                                           1236 non-null   float64
 38  Unemployment, youth (% ages 15–24)                                                       1236 non-null   float64
 39  Mortality rate, female grown up (per 1,000 people)                                       1253 non-null   float64
 40  Mortality rate, male grown up (per 1,000 people)                                         1253 non-null   float64
 41  Infants lacking immunization, red hot disease (% of one-galactic year-olds)              1219 non-null   float64
 42  Infants lacking immunization, Combination Vaccine (% of one-galactic year-olds)          1219 non-null   float64
 43  Gross galactic product (GGP) per capita                                                  1202 non-null   float64
 44  Gross galactic product (GGP), total                                                      1202 non-null   float64
 45  Outer Galaxies direct investment, net inflows (% of GGP)                                 1169 non-null   float64
 46  Exports and imports (% of GGP)                                                           1144 non-null   float64
 47  Share of seats in senate (% held by female)                                              1123 non-null   float64
 48  Natural resource depletion                                                               1132 non-null   float64
 49  Mean years of education, female (galactic years)                                         1140 non-null   float64
 50  Mean years of education, male (galactic years)                                           1138 non-null   float64
 51  Expected years of education, female (galactic years)                                     1109 non-null   float64
 52  Expected years of education, male (galactic years)                                       1108 non-null   float64
 53  Maternal mortality ratio (deaths per 100,000 live births)                                1252 non-null   float64
 54  Renewable energy consumption (% of total final energy consumption)                       1235 non-null   float64
 55  Estimated gross galactic income per capita, male                                         1055 non-null   float64
 56  Estimated gross galactic income per capita, female                                       1055 non-null   float64
 57  Rural population with access to electricity (%)                                          1029 non-null   float64
 58  Domestic credit provided by financial sector (% of GGP)                                  1079 non-null   float64
 59  Population with at least some secondary education, female (% ages 25 and older)          1089 non-null   float64
 60  Population with at least some secondary education, male (% ages 25 and older)            1087 non-null   float64
 61  Gross fixed capital formation (% of GGP)                                                 1074 non-null   float64
 62  Remittances, inflows (% of GGP)                                                          1028 non-null   float64
 63  Population with at least some secondary education (% ages 25 and older)                  1051 non-null   float64
 64  Intergalactic inbound tourists (thousands)                                               995 non-null    float64
 65  Gross enrolment ratio, primary (% of primary under-age population)                       1038 non-null   float64
 66  Respiratory disease incidence (per 100,000 people)                                       896 non-null    float64
 67  Interstellar phone subscriptions (per 100 people)                                        891 non-null    float64
 68  Interstellar Data Net users, total (% of population)                                     872 non-null    float64
 69  Current health expenditure (% of GGP)                                                    867 non-null    float64
 70  Intergalactic Development Index (IDI), female                                            916 non-null    float64
 71  Intergalactic Development Index (IDI), male                                              915 non-null    float64
 72  Gender Development Index (GDI)                                                           914 non-null    float64
 73  Intergalactic Development Index (IDI), female, Rank                                      893 non-null    float64
 74  Intergalactic Development Index (IDI), male, Rank                                        892 non-null    float64
 75  Adjusted net savings                                                                     912 non-null    float64
 76  Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total                941 non-null    float64
 77  Private galaxy capital flows (% of GGP)                                                  874 non-null    float64
 78  Gender Inequality Index (GII)                                                            844 non-null    float64
 79  y                                                                                        3865 non-null   float64
dtypes: float64(78), int64(1), object(1)
memory usage: 2.4+ MB
In [8]:
train.describe().T
Out[8]:
count mean std min 25% 50% 75% max
galactic year 3865.0 1.000709e+06 6945.463143 990025.000000 995006.000000 1000000.000000 1.006009e+06 1.015056e+06
existence expectancy index 3864.0 8.724787e-01 0.162367 0.227890 0.763027 0.907359 9.927599e-01 1.246908e+00
existence expectancy at birth 3864.0 7.679811e+01 10.461654 34.244062 69.961449 78.995101 8.455897e+01 1.002101e+02
Gross income per capita 3837.0 3.163324e+04 18736.378445 -126.906522 20169.118912 26600.768195 3.689863e+04 1.510727e+05
Income Index 3837.0 8.251535e-01 0.194055 0.292001 0.677131 0.827300 9.702946e-01 1.361883e+00
... ... ... ... ... ... ... ... ...
Adjusted net savings 912.0 2.125292e+01 14.258986 -76.741414 15.001028 22.182571 2.913474e+01 6.190364e+01
Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total 941.0 6.443023e+00 4.804873 -1.192011 4.113472 5.309497 6.814577e+00 3.653846e+01
Private galaxy capital flows (% of GGP) 874.0 2.226147e+01 34.342797 -735.186886 17.227899 24.472557 3.174829e+01 9.594124e+01
Gender Inequality Index (GII) 844.0 6.007333e-01 0.205785 0.089092 0.430332 0.624640 7.674039e-01 1.098439e+00
y 3865.0 8.277313e-02 0.063415 0.013036 0.047889 0.057820 8.738930e-02 6.838127e-01

79 rows × 8 columns

In [9]:
train.duplicated().sum()
Out[9]:
0
In [10]:
train.isnull().sum()
Out[10]:
galactic year                                                                   0
galaxy                                                                          0
existence expectancy index                                                      1
existence expectancy at birth                                                   1
Gross income per capita                                                        28
                                                                             ... 
Adjusted net savings                                                         2953
Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total    2924
Private galaxy capital flows (% of GGP)                                      2991
Gender Inequality Index (GII)                                                3021
y                                                                               0
Length: 80, dtype: int64
In [11]:
import missingno as msno
msno.matrix(train)
Out[11]:
<AxesSubplot:>
In [12]:
percent_missing = train.isnull().sum() * 100 / len(train)
missing_value_train = pd.DataFrame({'column_name': train.columns,
                                 'percent_missing': round(percent_missing)})
missing_value_train
Out[12]:
column_name percent_missing
galactic year galactic year 0.0
galaxy galaxy 0.0
existence expectancy index existence expectancy index 0.0
existence expectancy at birth existence expectancy at birth 0.0
Gross income per capita Gross income per capita 1.0
... ... ...
Adjusted net savings Adjusted net savings 76.0
Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total Creature Immunodeficiency Disease prevalence, ... 76.0
Private galaxy capital flows (% of GGP) Private galaxy capital flows (% of GGP) 77.0
Gender Inequality Index (GII) Gender Inequality Index (GII) 78.0
y y 0.0

80 rows × 2 columns

In [13]:
msno.heatmap(train)
Out[13]:
<AxesSubplot:>
In [14]:
msno.bar(train)
Out[14]:
<AxesSubplot:>
In [15]:
import seaborn as sns
sns.distplot(train['y'].dropna(), kde=True, bins=20, color='darkblue')
C:\Users\User\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
Out[15]:
<AxesSubplot:xlabel='y', ylabel='Density'>
In [16]:
train.corr().iloc[:,[-1]]
Out[16]:
y
galactic year 0.019264
existence expectancy index 0.547397
existence expectancy at birth 0.555757
Gross income per capita 0.508029
Income Index 0.579969
... ...
Adjusted net savings 0.164614
Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total -0.231993
Private galaxy capital flows (% of GGP) -0.130659
Gender Inequality Index (GII) -0.722279
y 1.000000

79 rows × 1 columns

In [17]:
import matplotlib.pyplot as plt

plt.figure(figsize=(20,20))

mask=np.triu(np.ones_like(train.corr(),dtype=np.bool))
heatmap=sns.heatmap(round(train.corr(),1), mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('correlation heatmap',fontdict={'fontsize': 20},pad=20);
C:\Users\User\AppData\Local\Temp\ipykernel_12168\1061572248.py:5: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  mask=np.triu(np.ones_like(train.corr(),dtype=np.bool))
In [18]:
plt.figure(figsize=(12,8))

heatmap=sns.heatmap(round(train.corr()[['y']]).sort_values(by='y', ascending=False),
                    vmin=-1,vmax=1,annot=True, cmap='BrBG')
heatmap.set_title('correlating with Y',fontdict={'fontsize': 12},pad=6);
In [19]:
train.corr().y.sort_values(ascending=False)
Out[19]:
y                                                                                   1.000000
Old age dependency ratio (old age (65 and older) per 100 creatures (ages 15-64))    0.679981
Estimated gross galactic income per capita, female                                  0.667465
Interstellar Data Net users, total (% of population)                                0.651823
Intergalactic Development Index (IDI)                                               0.625114
                                                                                      ...   
Young age (0-14) dependency ratio (per 100 creatures ages 15-64)                   -0.533741
Intergalactic Development Index (IDI), female, Rank                                -0.664882
Intergalactic Development Index (IDI), male, Rank                                  -0.680577
Intergalactic Development Index (IDI), Rank                                        -0.681592
Gender Inequality Index (GII)                                                      -0.722279
Name: y, Length: 79, dtype: float64
In [20]:
train.columns
Out[20]:
Index(['galactic year', 'galaxy', 'existence expectancy index',
       'existence expectancy at birth', 'Gross income per capita',
       'Income Index', 'Expected years of education (galactic years)',
       'Mean years of education (galactic years)',
       'Intergalactic Development Index (IDI)', 'Education Index',
       'Intergalactic Development Index (IDI), Rank',
       'Population using at least basic drinking-water services (%)',
       'Population using at least basic sanitation services (%)',
       'Gross capital formation (% of GGP)', 'Population, total (millions)',
       'Population, urban (%)',
       'Mortality rate, under-five (per 1,000 live births)',
       'Mortality rate, infant (per 1,000 live births)',
       'Old age dependency ratio (old age (65 and older) per 100 creatures (ages 15-64))',
       'Population, ages 15–64 (millions)',
       'Population, ages 65 and older (millions)',
       'Life expectancy at birth, male (galactic years)',
       'Life expectancy at birth, female (galactic years)',
       'Population, under age 5 (millions)',
       'Young age (0-14) dependency ratio (per 100 creatures ages 15-64)',
       'Adolescent birth rate (births per 1,000 female creatures ages 15-19)',
       'Total unemployment rate (female to male ratio)',
       'Vulnerable employment (% of total employment)',
       'Unemployment, total (% of labour force)',
       'Employment in agriculture (% of total employment)',
       'Labour force participation rate (% ages 15 and older)',
       'Labour force participation rate (% ages 15 and older), female',
       'Employment in services (% of total employment)',
       'Labour force participation rate (% ages 15 and older), male',
       'Employment to population ratio (% ages 15 and older)',
       'Jungle area (% of total land area)',
       'Share of employment in nonagriculture, female (% of total employment in nonagriculture)',
       'Youth unemployment rate (female to male ratio)',
       'Unemployment, youth (% ages 15–24)',
       'Mortality rate, female grown up (per 1,000 people)',
       'Mortality rate, male grown up (per 1,000 people)',
       'Infants lacking immunization, red hot disease (% of one-galactic year-olds)',
       'Infants lacking immunization, Combination Vaccine (% of one-galactic year-olds)',
       'Gross galactic product (GGP) per capita',
       'Gross galactic product (GGP), total',
       'Outer Galaxies direct investment, net inflows (% of GGP)',
       'Exports and imports (% of GGP)',
       'Share of seats in senate (% held by female)',
       'Natural resource depletion',
       'Mean years of education, female (galactic years)',
       'Mean years of education, male (galactic years)',
       'Expected years of education, female (galactic years)',
       'Expected years of education, male (galactic years)',
       'Maternal mortality ratio (deaths per 100,000 live births)',
       'Renewable energy consumption (% of total final energy consumption)',
       'Estimated gross galactic income per capita, male',
       'Estimated gross galactic income per capita, female',
       'Rural population with access to electricity (%)',
       'Domestic credit provided by financial sector (% of GGP)',
       'Population with at least some secondary education, female (% ages 25 and older)',
       'Population with at least some secondary education, male (% ages 25 and older)',
       'Gross fixed capital formation (% of GGP)',
       'Remittances, inflows (% of GGP)',
       'Population with at least some secondary education (% ages 25 and older)',
       'Intergalactic inbound tourists (thousands)',
       'Gross enrolment ratio, primary (% of primary under-age population)',
       'Respiratory disease incidence (per 100,000 people)',
       'Interstellar phone subscriptions (per 100 people)',
       'Interstellar Data Net users, total (% of population)',
       'Current health expenditure (% of GGP)',
       'Intergalactic Development Index (IDI), female',
       'Intergalactic Development Index (IDI), male',
       'Gender Development Index (GDI)',
       'Intergalactic Development Index (IDI), female, Rank',
       'Intergalactic Development Index (IDI), male, Rank',
       'Adjusted net savings ',
       'Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total',
       'Private galaxy capital flows (% of GGP)',
       'Gender Inequality Index (GII)', 'y'],
      dtype='object')
In [21]:
len(train['galactic year'].unique())
Out[21]:
26
In [22]:
len(train['galaxy'].unique())
Out[22]:
181
In [23]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=50)

data_train = imputer.fit_transform(train.drop(['galaxy', 'y'], axis=1))
data_train = pd.DataFrame(data=data_train, columns=train.drop(['galaxy', 'y'], axis=1).columns)
data_train['galaxy'] = train['galaxy']
data_train['y'] = train['y']

data_test = imputer.transform(test.drop(['galaxy'], axis=1))
data_test = pd.DataFrame(data=data_test, columns=test.drop(['galaxy'], axis=1).columns)
data_test['galaxy'] = test['galaxy']
In [24]:
data_train.corrwith(train['y']).sort_values(ascending=False)
Out[24]:
y                                                                                   1.000000
Old age dependency ratio (old age (65 and older) per 100 creatures (ages 15-64))    0.686479
Intergalactic Development Index (IDI), male                                         0.629423
Intergalactic Development Index (IDI)                                               0.627634
Intergalactic Development Index (IDI), female                                       0.623749
                                                                                      ...   
Young age (0-14) dependency ratio (per 100 creatures ages 15-64)                   -0.574097
Intergalactic Development Index (IDI), female, Rank                                -0.652907
Intergalactic Development Index (IDI), male, Rank                                  -0.659552
Intergalactic Development Index (IDI), Rank                                        -0.676616
Gender Inequality Index (GII)                                                      -0.692676
Length: 79, dtype: float64
In [25]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
In [27]:
scaled_train = scaler.fit_transform(data_train.drop(['galaxy', 'y'], axis=1))
scaled_train = pd.DataFrame(data=scaled_train, columns=data_train.drop(['galaxy', 'y'], axis=1).columns)
scaled_train['galaxy'] = train['galaxy']
scaled_train['y'] = train['y']

scaled_test = scaler.transform(data_test.drop(['galaxy'], axis=1))
scaled_test = pd.DataFrame(data=scaled_test, columns=data_test.drop(['galaxy'], axis=1).columns)
scaled_test['galaxy'] = test['galaxy']
In [28]:
data_train.drop(['galaxy', 'y'], axis=1)
Out[28]:
galactic year existence expectancy index existence expectancy at birth Gross income per capita Income Index Expected years of education (galactic years) Mean years of education (galactic years) Intergalactic Development Index (IDI) Education Index Intergalactic Development Index (IDI), Rank ... Current health expenditure (% of GGP) Intergalactic Development Index (IDI), female Intergalactic Development Index (IDI), male Gender Development Index (GDI) Intergalactic Development Index (IDI), female, Rank Intergalactic Development Index (IDI), male, Rank Adjusted net savings Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total Private galaxy capital flows (% of GGP) Gender Inequality Index (GII)
0 990025.0 0.628657 63.125200 27109.234310 0.646039 8.240543 9.414592 0.757927 0.667371 122.773739 ... 7.739157 0.683139 0.729316 0.978695 124.948040 126.732398 21.868164 5.291919 25.865679 0.725487
1 990025.0 0.818082 81.004994 30166.793958 0.852246 10.671823 4.742470 0.833624 0.467873 152.522198 ... 7.887836 0.779520 0.798319 1.008465 115.511324 114.885812 19.177926 5.422150 22.785018 0.684028
2 990025.0 0.659443 59.570534 8441.707353 0.499762 8.840316 5.583973 0.469110 0.363837 209.813266 ... 7.754494 0.596880 0.664201 0.937516 144.262043 141.635970 21.151265 6.534020 27.217467 0.805381
3 990025.0 0.555862 52.333293 19122.436285 0.640748 9.707928 6.477055 0.598840 0.488088 161.350127 ... 7.480740 0.606570 0.658516 0.939025 133.218633 131.232782 17.613850 5.912194 27.108045 0.785974
4 990025.0 0.991196 81.802464 81033.956906 1.131163 13.800672 13.188907 0.910341 0.918353 71.885345 ... 10.900723 1.030851 1.042364 1.046897 64.204582 57.884916 28.862055 5.611753 21.926318 0.337437
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3860 1015056.0 1.029704 82.832063 34310.471408 0.855094 18.578586 10.557143 0.906573 0.862826 144.896214 ... 10.392312 0.943410 0.902237 1.060532 124.564121 143.907576 26.438719 3.023709 29.294865 0.580785
3861 1015056.0 0.937869 75.877098 36899.067719 0.929494 16.153857 9.151665 0.865822 0.747577 164.692000 ... 10.296360 0.915225 0.798083 1.055118 163.664516 184.291155 20.637654 4.470596 31.085400 0.517558
3862 1015056.0 1.036144 93.540275 37002.977875 1.085245 21.066473 16.661344 0.983835 1.100779 63.726437 ... 9.601421 1.097208 1.044890 1.114754 66.498714 112.887035 28.154859 5.193997 32.145570 0.363862
3863 1015056.0 0.939034 78.274427 28180.459770 0.687655 9.388911 8.908748 0.735694 0.602703 216.805701 ... 4.137744 0.596164 0.754729 0.825864 182.249079 175.408953 38.963157 2.854140 27.227179 0.711878
3864 1015056.0 1.032244 91.641356 73109.215949 1.207746 18.910920 16.202486 1.171634 1.085080 63.924650 ... 18.252986 1.018083 1.099254 1.032783 57.204155 75.434029 23.337587 4.442307 29.957851 0.583706

3865 rows × 78 columns

In [29]:
data_test.drop(['galaxy'], axis=1)
Out[29]:
galactic year existence expectancy index existence expectancy at birth Gross income per capita Income Index Expected years of education (galactic years) Mean years of education (galactic years) Intergalactic Development Index (IDI) Education Index Intergalactic Development Index (IDI), Rank ... Current health expenditure (% of GGP) Intergalactic Development Index (IDI), female Intergalactic Development Index (IDI), male Gender Development Index (GDI) Intergalactic Development Index (IDI), female, Rank Intergalactic Development Index (IDI), male, Rank Adjusted net savings Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total Private galaxy capital flows (% of GGP) Gender Inequality Index (GII)
0 1007012.0 0.456086 51.562543 12236.576447 0.593325 10.414164 10.699072 0.547114 0.556267 232.621842 ... 8.589593 0.642279 0.700928 0.949454 170.251936 170.856791 14.631003 8.297189 21.069585 0.764036
1 1007012.0 0.529835 57.228262 3431.883825 0.675407 7.239485 5.311122 0.497688 0.409969 247.580771 ... 8.640996 0.640932 0.696880 0.958519 168.373447 171.729122 12.400737 8.338754 21.081108 0.777762
2 1008016.0 0.560976 59.379539 27562.914252 0.594624 11.774890 5.937797 0.544744 0.486167 249.798771 ... 8.461854 0.753123 0.796017 0.981746 146.227460 144.922929 19.998325 6.696963 21.566463 0.705869
3 1007012.0 0.565910 59.952390 20352.232905 0.837700 11.613621 10.067882 0.691641 0.523441 211.505060 ... 8.930545 0.691369 0.715762 0.972102 161.002454 158.023342 18.510995 7.182049 24.003704 0.756517
4 1013042.0 0.588274 55.428320 23959.704016 0.520579 10.392416 6.374637 0.530676 0.580418 234.721069 ... 7.357729 0.583373 0.600445 0.856158 206.674424 224.104054 20.009451 7.687626 23.553654 0.694438
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
885 1016064.0 0.936349 80.480709 34642.600864 0.874296 16.510144 11.562553 0.860859 0.840518 144.951511 ... 9.655038 0.862604 0.890035 1.025992 125.250495 127.009381 18.533047 6.150826 22.979806 0.570944
886 1016064.0 0.936349 80.480709 34642.600864 0.874296 16.510144 11.562553 0.860859 0.840518 144.951511 ... 9.655038 0.862604 0.890035 1.025992 125.250495 127.009381 18.533047 6.150826 22.979806 0.570944
887 1016064.0 0.936349 80.480709 34642.600864 0.874296 16.510144 11.562553 0.860859 0.840518 144.951511 ... 9.655038 0.862604 0.890035 1.025992 125.250495 127.009381 18.533047 6.150826 22.979806 0.570944
888 1016064.0 0.936349 80.480709 34642.600864 0.874296 16.510144 11.562553 0.860859 0.840518 144.951511 ... 9.655038 0.862604 0.890035 1.025992 125.250495 127.009381 18.533047 6.150826 22.979806 0.570944
889 1016064.0 0.936349 80.480709 34642.600864 0.874296 16.510144 11.562553 0.860859 0.840518 144.951511 ... 9.655038 0.862604 0.890035 1.025992 125.250495 127.009381 18.533047 6.150826 22.979806 0.570944

890 rows × 78 columns

In [30]:
data_train = pd.concat([data_train, pd.get_dummies(data_train['galaxy'])], axis=1).drop(['galaxy'], axis=1)
data_test = pd.concat([data_test, pd.get_dummies(data_test['galaxy'])], axis=1).drop(['galaxy'], axis=1)

scaled_train = pd.concat([scaled_train, pd.get_dummies(scaled_train['galaxy'])], axis=1).drop(['galaxy'], axis=1)
scaled_test = pd.concat([scaled_test, pd.get_dummies(scaled_test['galaxy'])], axis=1).drop(['galaxy'], axis=1)
In [31]:
data_train.dropna(inplace=True)
data_test.dropna(inplace=True)

scaled_train.dropna(inplace=True)
scaled_test.dropna(inplace=True)
In [32]:
data_train
Out[32]:
galactic year existence expectancy index existence expectancy at birth Gross income per capita Income Index Expected years of education (galactic years) Mean years of education (galactic years) Intergalactic Development Index (IDI) Education Index Intergalactic Development Index (IDI), Rank ... UGCA 292 UGCA 438 (ESO 407-018) UGCA 86 UGCA 92 Ursa Major I Dwarf (UMa I dSph) Ursa Major II Dwarf Ursa Minor Dwarf Virgo I Willman 1 Wolf-Lundmark-Melotte (WLM, DDO 221)
0 990025.0 0.628657 63.125200 27109.234310 0.646039 8.240543 9.414592 0.757927 0.667371 122.773739 ... 0 0 0 0 0 0 0 0 0 0
1 990025.0 0.818082 81.004994 30166.793958 0.852246 10.671823 4.742470 0.833624 0.467873 152.522198 ... 0 0 0 0 0 0 0 0 0 0
2 990025.0 0.659443 59.570534 8441.707353 0.499762 8.840316 5.583973 0.469110 0.363837 209.813266 ... 0 0 0 0 0 0 0 1 0 0
3 990025.0 0.555862 52.333293 19122.436285 0.640748 9.707928 6.477055 0.598840 0.488088 161.350127 ... 0 0 0 0 0 0 0 0 0 0
4 990025.0 0.991196 81.802464 81033.956906 1.131163 13.800672 13.188907 0.910341 0.918353 71.885345 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3860 1015056.0 1.029704 82.832063 34310.471408 0.855094 18.578586 10.557143 0.906573 0.862826 144.896214 ... 0 0 0 0 0 0 0 0 0 0
3861 1015056.0 0.937869 75.877098 36899.067719 0.929494 16.153857 9.151665 0.865822 0.747577 164.692000 ... 0 0 0 0 0 0 0 0 0 0
3862 1015056.0 1.036144 93.540275 37002.977875 1.085245 21.066473 16.661344 0.983835 1.100779 63.726437 ... 0 0 0 0 0 0 0 0 0 0
3863 1015056.0 0.939034 78.274427 28180.459770 0.687655 9.388911 8.908748 0.735694 0.602703 216.805701 ... 0 0 0 0 0 0 0 0 0 0
3864 1015056.0 1.032244 91.641356 73109.215949 1.207746 18.910920 16.202486 1.171634 1.085080 63.924650 ... 0 0 0 0 0 0 0 0 0 0

3865 rows × 260 columns

In [33]:
data_test
Out[33]:
galactic year existence expectancy index existence expectancy at birth Gross income per capita Income Index Expected years of education (galactic years) Mean years of education (galactic years) Intergalactic Development Index (IDI) Education Index Intergalactic Development Index (IDI), Rank ... UGCA 292 UGCA 438 (ESO 407-018) UGCA 86 UGCA 92 Ursa Major I Dwarf (UMa I dSph) Ursa Major II Dwarf Ursa Minor Dwarf Virgo I Willman 1 Wolf-Lundmark-Melotte (WLM, DDO 221)
0 1007012.0 0.456086 51.562543 12236.576447 0.593325 10.414164 10.699072 0.547114 0.556267 232.621842 ... 0 0 0 0 0 0 0 0 0 0
1 1007012.0 0.529835 57.228262 3431.883825 0.675407 7.239485 5.311122 0.497688 0.409969 247.580771 ... 0 0 0 0 0 0 0 0 0 0
2 1008016.0 0.560976 59.379539 27562.914252 0.594624 11.774890 5.937797 0.544744 0.486167 249.798771 ... 0 0 0 0 0 0 0 0 0 0
3 1007012.0 0.565910 59.952390 20352.232905 0.837700 11.613621 10.067882 0.691641 0.523441 211.505060 ... 0 0 0 0 0 0 0 0 0 0
4 1013042.0 0.588274 55.428320 23959.704016 0.520579 10.392416 6.374637 0.530676 0.580418 234.721069 ... 0 0 0 0 0 0 0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
885 1016064.0 0.936349 80.480709 34642.600864 0.874296 16.510144 11.562553 0.860859 0.840518 144.951511 ... 0 0 0 0 0 0 0 0 0 0
886 1016064.0 0.936349 80.480709 34642.600864 0.874296 16.510144 11.562553 0.860859 0.840518 144.951511 ... 0 0 0 0 0 0 0 0 0 0
887 1016064.0 0.936349 80.480709 34642.600864 0.874296 16.510144 11.562553 0.860859 0.840518 144.951511 ... 0 0 0 0 0 0 0 0 0 0
888 1016064.0 0.936349 80.480709 34642.600864 0.874296 16.510144 11.562553 0.860859 0.840518 144.951511 ... 0 0 0 0 0 0 0 0 0 0
889 1016064.0 0.936349 80.480709 34642.600864 0.874296 16.510144 11.562553 0.860859 0.840518 144.951511 ... 0 0 0 0 0 0 0 0 0 0

890 rows × 250 columns

In [35]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_train.drop(['y'], axis=1), data_train['y'], test_size=0.2, random_state=23)

X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(scaled_train.drop(['y'], axis=1), scaled_train['y'], test_size=0.2, random_state=23)
In [38]:
from sklearn.linear_model import LinearRegression
model_lr = LinearRegression()
In [40]:
model_lr.fit(X_train_scaled, y_train_scaled)
pred_lr = model_lr.predict(X_test_scaled)
In [50]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
lr_rmse = np.sqrt(mean_squared_error(y_test_scaled, pred_lr))
r2_lr = r2_score(y_test_scaled, pred_lr)
print('RMSE: %f' % lr_rmse, '\n' 'r2 score: %f' % r2_lr)
RMSE: 0.015308 
r2 score: 0.940908
In [51]:
from sklearn.ensemble import RandomForestRegressor
model_rf = RandomForestRegressor()
model_rf.fit(X_train, y_train)

pred_rf = model_rf.predict(X_test)
rf_rmse = np.sqrt(mean_squared_error(y_test, pred_rf))
r2_rf = r2_score(y_test, pred_rf)
print('RMSE: %f' % rf_rmse, '\n' 'r2 score: %f' % r2_rf)
RMSE: 0.021045 
r2 score: 0.888313
In [48]:
from sklearn.svm import SVR
model_svr = SVR()
model_svr.fit(X_train_scaled, y_train_scaled)

pred_svr = model_svr.predict(X_test_scaled)
svr_rmse = np.sqrt(mean_squared_error(y_test_scaled, pred_svr))
r2_svr = r2_score(y_test_scaled, pred_svr)
print('RMSE: %f' % svr_rmse, '\n' 'r2 score: %f' % r2_svr)
RMSE: 0.061497 
r2 score: 0.046317